#Dennis Moskov, Master Thesis
#Partial dependence RF
#conversion, selectivity and yield
#using "randomForest" package

#install.packages("randomForest")
#library(randomForest)

# Put the rows of DB into a random order; the fixed seed keeps the
# permutation identical from run to run.
set.seed(42)
DBf <- DB[sample(nrow(DB)), ]

# Lookup table of the modelled outcomes, one column per outcome:
#   row 1 - display name (used in the output file names)
#   row 2 - short label (presumably the matching column name in DB - confirm)
#   row 3 - column index of the outcome in DBf
# The matrix is of type character, so the indices are stored as strings and
# have to be converted back with as.numeric() before they are used.
results <- matrix(
  c(
    "Conversion", "Selectivity", "Yield",
    "X.MeOH", "S.MeOH", "Y.MeOH",
    30, 31, 32
  ),
  nrow = 3,
  byrow = TRUE
)
#results<-rbind(c("Conversion","Selectivity","Yield"),c("X.MeOH","S.MeOH","Y.MeOH"),c(15,16,17))    #for clustered data base

# Empty (all NA) container with one row per observation for fitted and
# observed values; nothing in the visible part of the script fills it yet.
outcome <- matrix(
  NA,
  nrow = nrow(DBf),
  ncol = 7,
  dimnames = list(
    NULL,
    c(
      "fitted conversion", "fitted selectivity", "fitted Yield",
      "observed conversion", "observed selectivity", "observed yield",
      "clculated fitted yield"
    )
  )
)

# Loop over the outcomes (one column of `results` each): tune and fit a random
# forest for the outcome, then write its partial dependence plots to disk
# (one multi-panel PDF per outcome plus one PNG per predictor).
# The randomForest package must be attached before this runs (tuneRF,
# randomForest, importance and partialPlot are used unqualified).
for (r in seq_len(ncol(results))) {

  # Use the desired outcome: drop column 1 and the columns of the two other
  # outcomes. The code below treats the LAST remaining column as the response,
  # i.e. it assumes the outcome columns are the last columns of DBf.
  useDB <- DBf[-c(1, as.numeric(results[3, -r]))]
  resp_col <- ncol(useDB)

  # number of trees, used both while tuning and for the final forest
  nt <- 500

  # Find the best number of variables tried at each split (mtry).
  nv <- tuneRF(
    useDB[, -resp_col, drop = FALSE], useDB[, resp_col],
    stepFactor = 1.5, improve = 0.005, ntreeTry = nt
  )
  # tuneRF returns a matrix with mtry in column 1 and the OOB error in
  # column 2. Take the mtry of the row with the smallest error. which.min()
  # always yields exactly one row, whereas matching the minimum as a regular
  # expression (the previous approach) could select several rows and hand a
  # vector to randomForest().
  bestnv <- unname(nv[which.min(nv[, 2]), 1])

  # fit a random forest
  fit <- randomForest(
    useDB[, -resp_col, drop = FALSE], useDB[, resp_col],
    mtry = bestnv, importance = TRUE, ntree = nt
  )

  # Predictor names ordered by the first column of the importance matrix,
  # most important first. This has to be computed from the current fit
  # BEFORE plotting; the plotting loops below iterate over it.
  imp <- importance(fit)
  impvar <- rownames(imp)[order(imp[, 1], decreasing = TRUE)]

  # Save all partial dependencies of this outcome to one PDF (4 x 7 panels
  # per page). NOTE: paste() separates its arguments with a space, so the file
  # name contains a space before ".pdf"; kept as is so existing file names do
  # not change.
  # partialPlot() captures its third argument unevaluated, so it must be given
  # as the expression impvar[i] (or a literal string), not as a bare variable
  # that merely holds the name.
  pdf(paste("Partial Dependence for ", results[1, r], ".pdf"))
  tryCatch(
    {
      op <- par(mfrow = c(4, 7))
      for (i in seq_along(impvar)) {
        partialPlot(fit, DBf, impvar[i], xlab = impvar[i], main = "")
      }
      par(op)
    },
    # close the device even if a plot fails, so no device is left open
    finally = dev.off()
  )

  # Additionally save one PNG per predictor, numbered by importance rank.
  for (i in seq_along(impvar)) {
    png(filename = paste(i, "Partial Dependence on ", impvar[i], "for ", results[1, r], ".png"))
    tryCatch(
      partialPlot(
        fit, DBf, impvar[i],
        xlab = impvar[i], main = paste("Partial Dependence on", impvar[i])
      ),
      finally = dev.off()
    )
  }

}













